{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
      "  from ._conv import register_converters as _register_converters\n",
      "/usr/local/lib/python3.5/dist-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
      "  \"This module will be removed in 0.20.\", DeprecationWarning)\n"
     ]
    }
   ],
   "source": [
    "from utils import *\n",
    "import tensorflow as tf\n",
    "from sklearn.cross_validation import train_test_split\n",
    "import time\n",
    "import copy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['negative', 'positive']\n",
      "10662\n",
      "10662\n"
     ]
    }
   ],
   "source": [
    "trainset = sklearn.datasets.load_files(container_path = 'data', encoding = 'UTF-8')\n",
    "trainset.data, trainset.target = separate_dataset(trainset,1.0)\n",
    "print (trainset.target_names)\n",
    "print (len(trainset.data))\n",
    "print (len(trainset.target))"
   ]
  },
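  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The corpus holds 10,662 short reviews split almost evenly between `negative` and `positive`, so a constant prediction already scores about 50% accuracy; keep that baseline in mind when reading the training log below."
   ]
  },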
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "ONEHOT = np.zeros((len(trainset.data),len(trainset.target_names)))\n",
    "ONEHOT[np.arange(len(trainset.data)),trainset.target] = 1.0\n",
    "train_X, test_X, train_Y, test_Y, train_onehot, test_onehot = train_test_split(trainset.data, \n",
    "                                                                               trainset.target, \n",
    "                                                                               ONEHOT, test_size = 0.2)"
   ]
  },
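  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The integer labels are also kept as a one-hot matrix (`ONEHOT[i, trainset.target[i]] = 1`) because the softmax cross-entropy loss used later expects one-hot targets; `train_test_split` holds out 20% of the examples for validation."
   ]
  },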
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "vocab from size: 20465\n",
      "Most common words [('the', 10129), ('a', 7312), ('and', 6199), ('of', 6063), ('to', 4233), ('is', 3378)]\n",
      "Sample data [4, 657, 9, 2611, 8, 22, 4, 3483, 12390, 97] ['the', 'rock', 'is', 'destined', 'to', 'be', 'the', '21st', 'centurys', 'new']\n"
     ]
    }
   ],
   "source": [
    "concat = ' '.join(trainset.data).split()\n",
    "vocabulary_size = len(list(set(concat)))\n",
    "data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)\n",
    "print('vocab from size: %d'%(vocabulary_size))\n",
    "print('Most common words', count[4:10])\n",
    "print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])"
   ]
  },
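  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`build_dataset` (from `utils`) reserves the four lowest ids for the special tokens `GO`, `PAD`, `EOS` and `UNK`, which is why the most-common-words list starts at `count[4:]` and why the embedding matrix is later sized `vocabulary_size + 4`."
   ]
  },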
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "GO = dictionary['GO']\n",
    "PAD = dictionary['PAD']\n",
    "EOS = dictionary['EOS']\n",
    "UNK = dictionary['UNK']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def contruct_cells(hidden_structs):\n",
    "    cells = []\n",
    "    for hidden_dims in hidden_structs: cells.append(tf.contrib.rnn.LSTMCell(hidden_dims))\n",
    "    return cells\n",
    "\n",
    "def rnn_reformat(x, input_dims, n_steps):\n",
    "    x_ = tf.transpose(x, [1, 0, 2])\n",
    "    x_ = tf.reshape(x_, [-1, input_dims])\n",
    "    return tf.split(x_, n_steps, 0)\n",
    "\n",
    "def dilated_rnn(cell, inputs, rate, scope='default'):\n",
    "    n_steps = len(inputs)\n",
    "    if not (n_steps % rate) == 0:\n",
    "        zero_tensor = tf.zeros_like(inputs[0])\n",
    "        dilated_n_steps = n_steps // rate + 1\n",
    "        for i_pad in range(dilated_n_steps * rate - n_steps): inputs.append(zero_tensor)\n",
    "    else:\n",
    "        dilated_n_steps = n_steps // rate\n",
    "    dilated_inputs = [tf.concat(inputs[i * rate:(i + 1) * rate], axis=0) for i in range(dilated_n_steps)]\n",
    "    dilated_outputs, _ = tf.contrib.rnn.static_rnn(cell, dilated_inputs, dtype=tf.float32, scope=scope)\n",
    "    splitted_outputs = [tf.split(output, rate, axis=0) for output in dilated_outputs]\n",
    "    unrolled_outputs = [output for sublist in splitted_outputs for output in sublist]\n",
    "    return unrolled_outputs[:n_steps]\n",
    "\n",
    "def multi_dilated_rnn(cells, inputs, dilations):\n",
    "    x = copy.copy(inputs)\n",
    "    for cell, dilation in zip(cells, dilations):\n",
    "        x = dilated_rnn(cell, x, dilation, scope=\"multi_dilated_rnn_%d\" % dilation)\n",
    "    return x\n",
    "\n",
    "class Model:\n",
    "    def __init__(self, steps, dict_size, dimension_input, dimension_output, learning_rate = 1e-2, \n",
    "                 hidden_structs = [20], dilations = [1, 2, 4, 8, 16, 32, 64, 128, 256]):\n",
    "        hidden_structs = hidden_structs * len(dilations)\n",
    "        self.X = tf.placeholder(tf.int32, [None, steps])\n",
    "        self.Y = tf.placeholder(tf.float32, [None, dimension_output])\n",
    "        encoder_embeddings = tf.Variable(tf.random_uniform([dict_size, dimension_input], -1, 1))\n",
    "        encoder_embedded = tf.nn.embedding_lookup(encoder_embeddings, self.X)\n",
    "        x_reformat = rnn_reformat(encoder_embedded, dimension_input, steps)\n",
    "        cells = contruct_cells(hidden_structs)\n",
    "        layer_outputs = multi_dilated_rnn(cells, x_reformat, dilations)\n",
    "        if dilations[0] == 1:\n",
    "            weights = tf.Variable(tf.random_normal(shape=[hidden_structs[-1], dimension_output]))\n",
    "            bias = tf.Variable(tf.random_normal(shape=[dimension_output]))\n",
    "            self.logits = tf.matmul(layer_outputs[-1], weights) + bias\n",
    "        else:\n",
    "            weights = tf.Variable(tf.random_normal(shape=[hidden_structs[-1] * dilations[0], dimension_output]))\n",
    "            bias = tf.Variable(tf.random_normal(shape=[dimension_output]))\n",
    "            for idx, i in enumerate(range(-dilations[0], 0, 1)):\n",
    "                if idx == 0:\n",
    "                    hidden_outputs_ = layer_outputs[i]\n",
    "                else:\n",
    "                    hidden_outputs_ = tf.concat([hidden_outputs_, layer_outputs[i]],axis=1)\n",
    "            self.logits = tf.matmul(hidden_outputs_, weights) + bias\n",
    "        self.cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits = self.logits, labels = self.Y))\n",
    "        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)\n",
    "        self.correct_pred = tf.equal(tf.argmax(self.logits, 1), tf.argmax(self.Y, 1))\n",
    "        self.accuracy = tf.reduce_mean(tf.cast(self.correct_pred, tf.float32))"
   ]
  },
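  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The trick in `dilated_rnn` is that a layer with dilation `rate` concatenates every `rate` consecutive time steps along the batch axis before calling `static_rnn`, so the wrapped cell is unrolled only `n_steps // rate` times (one more when padding is needed) and each recurrent hop spans `rate` original steps; the outputs are then split back to restore the per-step layout. The next cell is a minimal pure-Python sketch of just that index regrouping, with toy numbers chosen here for illustration, including the zero padding applied when `rate` does not divide the sequence length."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# illustrative only: how dilated_rnn groups time steps when rate = 4\n",
    "n_steps, rate = 10, 4\n",
    "steps = list(range(n_steps))   # stand-ins for the per-step tensors\n",
    "pad = (-n_steps) % rate        # zeros appended when rate doesn't divide n_steps\n",
    "steps += ['zero'] * pad\n",
    "grouped = [steps[i * rate:(i + 1) * rate] for i in range(len(steps) // rate)]\n",
    "# the wrapped LSTM would see only len(grouped) = 3 'super-steps'\n",
    "print(grouped)  # [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 'zero', 'zero']]"
   ]
  },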
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "embedded_size = 128\n",
    "dimension_output = len(trainset.target_names)\n",
    "maxlen = 50\n",
    "batch_size = 128"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "tf.reset_default_graph()\n",
    "sess = tf.InteractiveSession()\n",
    "model = Model(maxlen,vocabulary_size+4,embedded_size,dimension_output)\n",
    "sess.run(tf.global_variables_initializer())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch: 0, pass acc: 0.000000, current acc: 0.500488\n",
      "time taken: 19.016448736190796\n",
      "epoch: 0, training loss: 0.703227, training acc: 0.496922, valid loss: 0.693988, valid acc: 0.500488\n",
      "\n",
      "time taken: 24.81192111968994\n",
      "epoch: 1, training loss: 0.693919, training acc: 0.502131, valid loss: 0.694108, valid acc: 0.500488\n",
      "\n",
      "time taken: 24.935855865478516\n",
      "epoch: 2, training loss: 0.693885, training acc: 0.504735, valid loss: 0.694146, valid acc: 0.500488\n",
      "\n",
      "time taken: 24.444278478622437\n",
      "epoch: 3, training loss: 0.693864, training acc: 0.504735, valid loss: 0.694135, valid acc: 0.500488\n",
      "\n",
      "time taken: 24.814770936965942\n",
      "epoch: 4, training loss: 0.693861, training acc: 0.502131, valid loss: 0.694106, valid acc: 0.500488\n",
      "\n",
      "time taken: 24.540029525756836\n",
      "epoch: 5, training loss: 0.693863, training acc: 0.502131, valid loss: 0.694067, valid acc: 0.500488\n",
      "\n",
      "break epoch:6\n",
      "\n"
     ]
    }
   ],
   "source": [
    "EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 5, 0, 0, 0\n",
    "while True:\n",
    "    lasttime = time.time()\n",
    "    if CURRENT_CHECKPOINT == EARLY_STOPPING:\n",
    "        print('break epoch:%d\\n'%(EPOCH))\n",
    "        break\n",
    "        \n",
    "    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0\n",
    "    for i in range(0, (len(train_X) // batch_size) * batch_size, batch_size):\n",
    "        batch_x = str_idx(train_X[i:i+batch_size],dictionary,maxlen)\n",
    "        acc, loss, _ = sess.run([model.accuracy, model.cost, model.optimizer], \n",
    "                           feed_dict = {model.X : batch_x, model.Y : train_onehot[i:i+batch_size]})\n",
    "        train_loss += loss\n",
    "        train_acc += acc\n",
    "    \n",
    "    for i in range(0, (len(test_X) // batch_size) * batch_size, batch_size):\n",
    "        batch_x = str_idx(test_X[i:i+batch_size],dictionary,maxlen)\n",
    "        acc, loss = sess.run([model.accuracy, model.cost], \n",
    "                           feed_dict = {model.X : batch_x, model.Y : test_onehot[i:i+batch_size]})\n",
    "        test_loss += loss\n",
    "        test_acc += acc\n",
    "    \n",
    "    train_loss /= (len(train_X) // batch_size)\n",
    "    train_acc /= (len(train_X) // batch_size)\n",
    "    test_loss /= (len(test_X) // batch_size)\n",
    "    test_acc /= (len(test_X) // batch_size)\n",
    "    \n",
    "    if test_acc > CURRENT_ACC:\n",
    "        print('epoch: %d, pass acc: %f, current acc: %f'%(EPOCH,CURRENT_ACC, test_acc))\n",
    "        CURRENT_ACC = test_acc\n",
    "        CURRENT_CHECKPOINT = 0\n",
    "    else:\n",
    "        CURRENT_CHECKPOINT += 1\n",
    "        \n",
    "    print('time taken:', time.time()-lasttime)\n",
    "    print('epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\\n'%(EPOCH,train_loss,\n",
    "                                                                                          train_acc,test_loss,\n",
    "                                                                                          test_acc))\n",
    "    EPOCH += 1"
   ]
  },
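  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Training stalls at the ~50% chance baseline: validation accuracy never improves past 0.5005, so the early-stopping counter runs out after five stagnant epochs and the loop breaks at epoch 6."
   ]
  },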
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "             precision    recall  f1-score   support\n",
      "\n",
      "   negative       0.50      1.00      0.67      1067\n",
      "   positive       0.00      0.00      0.00      1066\n",
      "\n",
      "avg / total       0.25      0.50      0.33      2133\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n"
     ]
    }
   ],
   "source": [
    "logits = sess.run(model.logits, feed_dict={model.X:str_idx(test_X,dictionary,maxlen)})\n",
    "print(metrics.classification_report(test_Y, np.argmax(logits,1), target_names = trainset.target_names))"
   ]
  },
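  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The classification report confirms the collapse: the network predicts `negative` for every test example (recall 1.00 for `negative`, 0.00 for `positive`), which is also what triggers sklearn's `UndefinedMetricWarning` about precision being ill-defined for a class with no predicted samples."
   ]
  }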
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
